In [1]:
import numpy as np 
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
from warnings import filterwarnings 
filterwarnings("ignore")
In [2]:
data=pd.read_csv(r"C:\Users\laxma\Downloads\Fake News Detection Dataset.csv")
In [3]:
data.shape
Out[3]:
(4500, 6)
In [4]:
data.head()
Out[4]:
ID Word_Count Number_of_Sentence Unique_Words Average_Word_Length Label
0 1606 10 4 24 6.176750 1
1 3718 10 8 25 5.826770 1
2 2634 10 7 18 4.619040 1
3 5560 10 6 18 4.961424 1
4 7494 10 4 21 4.114324 1
In [5]:
data.tail()
Out[5]:
ID Word_Count Number_of_Sentence Unique_Words Average_Word_Length Label
4495 1179 41 7 12 6.963924 0
4496 9445 100 5 15 3.136755 1
4497 4149 100 8 18 3.376823 1
4498 9877 85 14 42 5.331393 0
4499 6709 57 6 7 4.312751 0
In [6]:
data.isnull().sum()
Out[6]:
ID                     0
Word_Count             0
Number_of_Sentence     0
Unique_Words           0
Average_Word_Length    0
Label                  0
dtype: int64
In [7]:
data.duplicated().sum()
Out[7]:
0
In [8]:
data.describe()
Out[8]:
ID Word_Count Number_of_Sentence Unique_Words Average_Word_Length Label
count 4500.000000 4500.000000 4500.000000 4500.000000 4500.000000 4500.000000
mean 5469.140000 53.934000 8.934667 24.943333 4.968105 0.329556
std 2599.193059 24.872743 3.407847 11.540708 1.152394 0.470104
min 1002.000000 10.000000 4.000000 5.000000 3.000385 0.000000
25% 3228.750000 35.000000 6.000000 17.000000 3.980553 0.000000
50% 5449.500000 52.000000 9.000000 22.000000 4.906200 0.000000
75% 7706.750000 75.000000 12.000000 33.000000 5.996111 1.000000
max 9999.000000 100.000000 15.000000 50.000000 6.999799 1.000000
In [9]:
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4500 entries, 0 to 4499
Data columns (total 6 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   ID                   4500 non-null   int64  
 1   Word_Count           4500 non-null   int64  
 2   Number_of_Sentence   4500 non-null   int64  
 3   Unique_Words         4500 non-null   int64  
 4   Average_Word_Length  4500 non-null   float64
 5   Label                4500 non-null   int64  
dtypes: float64(1), int64(5)
memory usage: 211.1 KB
In [10]:
data.columns
Out[10]:
Index(['ID', 'Word_Count', 'Number_of_Sentence', 'Unique_Words',
       'Average_Word_Length', 'Label'],
      dtype='object')
In [11]:
#VISUALIZATION
In [12]:
plt.scatter(data['ID'],data['Number_of_Sentence'],color='yellowgreen')
plt.xticks(rotation=90)
plt.show()
In [13]:
fig=px.violin(data,x='Word_Count',y='Unique_Words',color='Word_Count')
fig.show()
In [14]:
plt.bar(data['Label'],data['ID'])
plt.xticks(rotation=90)
plt.show()
In [15]:
plt.figure(figsize=(10,4))
sns.countplot(x='Word_Count', data=data, color='b')
plt.xticks(rotation=90)
plt.show()
In [16]:
plt.figure(figsize=(10,4))
top_car = data['Label'].value_counts().nlargest(10)
sns.countplot(y=data.Label,color='red')
Out[16]:
<AxesSubplot:xlabel='count', ylabel='Label'>
In [17]:
plt.figure(figsize=(8, 4))
sns.scatterplot(data=data, x='Label', y='Word_Count')
plt.xlabel('Label')
plt.ylabel('Word_Count')
plt.show()
In [18]:
sns.barplot(data['Unique_Words'],data['Average_Word_Length'],color='cyan')
plt.xticks(rotation=90)
plt.show()
In [19]:
sns.displot(data["Label"])
Out[19]:
<seaborn.axisgrid.FacetGrid at 0x17cc89784c0>
In [20]:
sns.lineplot(x='Number_of_Sentence', y='Label', data=data)
Out[20]:
<AxesSubplot:xlabel='Number_of_Sentence', ylabel='Label'>
In [21]:
sns.countplot(x='Unique_Words',data=data)
Out[21]:
<AxesSubplot:xlabel='Unique_Words', ylabel='count'>
In [22]:
#MODEL BUILDING
In [23]:
y = data["Word_Count"].values
x_data=data.drop(["Word_Count"],axis=1)
x = (x_data-np.min(x_data))/(np.max(x_data)-np.min(x_data))
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size = 0.2,random_state=1)
In [24]:
from sklearn.svm import SVC
svm=SVC(random_state=1)
svm.fit(x_train,y_train)
print("train accuracy:",svm.score(x_train,y_train))
print("test accuracy:",svm.score(x_test,y_test))
train accuracy: 0.08583333333333333
test accuracy: 0.05333333333333334
In [ ]:
 
In [ ]: